{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "2d36e76e",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "import librosa\n",
    "import soundfile as sf\n",
    "from tqdm import tqdm\n",
    "from multiprocess import Pool\n",
    "import itertools\n",
    "\n",
    "def chunks(l, n):\n",
    "    for i in range(0, len(l), n):\n",
    "        yield (l[i: i + n], i // n)\n",
    "\n",
    "def multiprocessing(strings, function, cores=6, returned=True):\n",
    "    df_split = chunks(strings, len(strings) // cores)\n",
    "    pool = Pool(cores)\n",
    "    pooled = pool.map(function, df_split)\n",
    "    pool.close()\n",
    "    pool.join()\n",
    "\n",
    "    if returned:\n",
    "        return list(itertools.chain(*pooled))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "39676150",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('CompA-R.json') as fopen:\n",
    "    train = json.load(fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "2ad20375",
   "metadata": {},
   "outputs": [],
   "source": [
    "!mkdir compa_r_train_audios-mp3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "aae125ba",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "198648"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "b5f0e6f3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'instruction': 'Analyze the frequency and duration of the revving sounds in the audio. Based on these characteristics, infer the type of vehicle producing these sounds.',\n",
       " 'output': 'The frequent and lengthy revving sounds suggest a powerful vehicle, likely a race car or motorcycle, which fits the context of a race car event.',\n",
       " 'audio_id': './compa_r_train_audios/YBaw0jIZ0STo.wav',\n",
       " 'input': '',\n",
       " 'dataset': 'Audioset_Strong',\n",
       " 'task': 'open-ended question'}"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "de81c58b",
   "metadata": {},
   "outputs": [],
   "source": [
    "def loop(rows):\n",
    "    rows, _ = rows\n",
    "    data = []\n",
    "    for r in tqdm(rows):\n",
    "        if not os.path.exists(r['audio_id']):\n",
    "            continue\n",
    "        audio_filename = os.path.join(\n",
    "            'compa_r_train_audios-mp3', \n",
    "            os.path.split(r['audio_id'])[1].replace('.wav', '.mp3'))\n",
    "        \n",
    "        if not os.path.exists(audio_filename):\n",
    "            y, sr = librosa.load(r['audio_id'], sr = 16000)\n",
    "            sf.write(audio_filename, y, sr)\n",
    "        \n",
    "        data.append({\n",
    "            'question': r['instruction'],\n",
    "            'answer': r['output'],\n",
    "            'audio_filename': audio_filename,\n",
    "            'metadata': json.dumps(r),\n",
    "        })\n",
    "    return data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "107e9772",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 55.16it/s]\n"
     ]
    }
   ],
   "source": [
    "processed = loop((train[:10], 0))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "5d949a69",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████████████████████████████████████████████████████████| 9932/9932 [02:33<00:00, 64.68it/s]\n",
      "100%|██████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 58052.65it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████████| 9932/9932 [02:42<00:00, 61.22it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████████| 9932/9932 [02:42<00:00, 61.08it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████████| 9932/9932 [02:44<00:00, 60.21it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████████| 9932/9932 [02:53<00:00, 57.39it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████████| 9932/9932 [02:54<00:00, 56.85it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████████| 9932/9932 [03:04<00:00, 53.90it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████████| 9932/9932 [03:08<00:00, 52.75it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████████| 9932/9932 [03:18<00:00, 50.16it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████████| 9932/9932 [03:21<00:00, 49.25it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████████| 9932/9932 [03:21<00:00, 49.23it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████████| 9932/9932 [03:21<00:00, 49.20it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████████| 9932/9932 [03:21<00:00, 49.30it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████████| 9932/9932 [03:24<00:00, 48.68it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████████| 9932/9932 [03:22<00:00, 49.02it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████████| 9932/9932 [03:21<00:00, 49.38it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████████| 9932/9932 [03:21<00:00, 49.26it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████████| 9932/9932 [03:22<00:00, 49.02it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████████| 9932/9932 [03:22<00:00, 49.13it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████████████| 9932/9932 [03:22<00:00, 49.02it/s]\n"
     ]
    }
   ],
   "source": [
    "processed = multiprocessing(train, loop, cores = 20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "1a8de73d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2.9G\tcompa_r_train_audios-mp3\r\n"
     ]
    }
   ],
   "source": [
    "!du -hs compa_r_train_audios-mp3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "226e1158",
   "metadata": {},
   "outputs": [],
   "source": [
    "!rm -rf compa_r_train_audios"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "30409c34",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "198648"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(processed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "0c54011c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'question': \"The sounds of keypress tones and generic impact sounds are interspersed through the audio. Analyze these and infer what they might indicate about the telephone operation and the speaker's actions during the call.\",\n",
       " 'answer': 'The speaker might be navigating through automated menus before and during the call, indicated by the keypress tones. The generic impact sounds could signify basic desk-related activities, like pen-clicking or paper shuffling.',\n",
       " 'audio_filename': 'compa_r_train_audios-mp3/Y349kbyfz0qU.mp3',\n",
       " 'metadata': '{\"instruction\": \"The sounds of keypress tones and generic impact sounds are interspersed through the audio. Analyze these and infer what they might indicate about the telephone operation and the speaker\\'s actions during the call.\", \"output\": \"The speaker might be navigating through automated menus before and during the call, indicated by the keypress tones. The generic impact sounds could signify basic desk-related activities, like pen-clicking or paper shuffling.\", \"audio_id\": \"./compa_r_train_audios/Y349kbyfz0qU.wav\", \"input\": \"\", \"dataset\": \"Audioset_Strong\", \"task\": \"open-ended question\"}'}"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "processed[10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "8904e6da",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "494"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "with open('CompA-R-test.json') as fopen:\n",
    "    test = json.load(fopen)\n",
    "    \n",
    "len(test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "2f5736de",
   "metadata": {},
   "outputs": [],
   "source": [
    "!mkdir compa_r_test_audios-mp3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "861fb372",
   "metadata": {},
   "outputs": [],
   "source": [
    "def loop(rows):\n",
    "    rows, _ = rows\n",
    "    data = []\n",
    "    for r in tqdm(rows):\n",
    "        f = os.path.join('filtered_audios', r['audio_id'])\n",
    "        if not os.path.exists(f):\n",
    "            continue\n",
    "        audio_filename = os.path.join(\n",
    "            'compa_r_test_audios-mp3', \n",
    "            os.path.split(r['audio_id'])[1].replace('.wav', '.mp3'))\n",
    "        \n",
    "        if not os.path.exists(audio_filename):\n",
    "            y, sr = librosa.load(f, sr = 16000)\n",
    "            sf.write(audio_filename, y, sr)\n",
    "        \n",
    "        data.append({\n",
    "            'question': r['instruction_output'][0]['instruction'],\n",
    "            'answer': r['instruction_output'][0]['output'],\n",
    "            'audio_filename': audio_filename,\n",
    "            'metadata': json.dumps(r),\n",
    "        })\n",
    "    return data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "60d72b10",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████████████████████████████████████████████████████████████| 494/494 [00:33<00:00, 14.78it/s]\n"
     ]
    }
   ],
   "source": [
    "test_processed = loop((test, 0))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "d4f560c7",
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import Dataset\n",
    "\n",
    "dataset = Dataset.from_list(processed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "0227124f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "7478a879846443f9a37c8a4f89b1a85c",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e832ec7329734688b7df0b64a231b3b7",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating parquet from Arrow format:   0%|          | 0/199 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "3bf634fb08564a2fa105c8e0cbe466ec",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Uploading...:   0%|          | 0.00/60.9M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/CompA-R-Instructions/commit/6a515fee92b73408de769166f3984836967227a2', commit_message='Upload dataset', commit_description='', oid='6a515fee92b73408de769166f3984836967227a2', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mesolitica/CompA-R-Instructions', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mesolitica/CompA-R-Instructions'), pr_revision=None, pr_num=None)"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset.push_to_hub('mesolitica/CompA-R-Instructions', split = 'train')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "e1df099d",
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset = Dataset.from_list(test_processed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "af0b4d3c",
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "ba0146bba91340b7a8d99db0990ef634",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0e0a7686b1a2421db55619549e43ca06",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "8de6430d152b4d98aea45b6ec942096b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Uploading...:   0%|          | 0.00/415k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "758f802c3a7043cd97c859cfa776d8a6",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "README.md:   0%|          | 0.00/400 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/CompA-R-Instructions/commit/ce9bb7b9a98474f68f8bf2c1d08f1a1a16311acd', commit_message='Upload dataset', commit_description='', oid='ce9bb7b9a98474f68f8bf2c1d08f1a1a16311acd', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mesolitica/CompA-R-Instructions', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mesolitica/CompA-R-Instructions'), pr_revision=None, pr_num=None)"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset.push_to_hub('mesolitica/CompA-R-Instructions', split = 'test')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "2b7a934a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "62613"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from glob import glob\n",
    "\n",
    "audio_files = glob('compa_r_*_audios-mp3/*.mp3')\n",
    "len(audio_files)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "9d11fa5d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import zipfile\n",
    "\n",
    "with zipfile.ZipFile('compa_r.zip', 'w', zipfile.ZIP_DEFLATED) as zipf:\n",
    "    for f in audio_files:\n",
    "        zipf.write(f, arcname=f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5772bcdb",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Uploading files using Xet Storage..\n",
      "Uploading...:  99%|████████████████████████▊| 2.87G/2.89G [00:20<00:00, 574MB/s]"
     ]
    }
   ],
   "source": [
    "!huggingface-cli upload mesolitica/CompA-R-Instructions compa_r.zip --repo-type=dataset"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
